In the previous step we gather all known cancer hotspots by scavenging calls by - checking for overlap with amino acid positions in a curated and published cancer hotspot database - checking for overlap with non-coding region hotspots mutations , here we are using the in TERT promoter region
In this notebook, we will create consensus mafs from the filtered call scratch/hotspot-detection/*RDS. Since there are slight differences in read support from each caller, we will take a mean value to provide the information per site in the consensus maf file. All other vcf2maf columns will be added from strelka/mutect2
library("tidyverse")
library("maftools")
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data")
results_dir <- file.path(root_dir,
"analyses",
"hotspots-detection",
"results")
# Make output folder
if (!dir.exists(results_dir )) {
dir.create(results_dir , recursive = TRUE)
}
scratch_dir <- file.path(
root_dir,
"scratch",
"hotspot-detection"
)
# combined hotspot calls
combined_hotspot_calls <- list.files(path = scratch_dir,pattern = ".RDS",full.names = TRUE) %>%
map(readRDS) %>%
bind_rows()
# hotspot mutation maf file output
output_file <- file.path(results_dir,"pbta-snv-hotspots-mutation.maf.tsv.gz")
# PBTA consensus maf
maf_coltypes <- readRDS(file.path("input","maf_coltypes.RDS"))
consensus_maf_file <- read_tsv(file.path(data_dir,"pbta-snv-consensus-mutation.maf.tsv.gz"),
col_types = readr::cols(Entrez_Gene_Id = readr::col_character(),
ALLELE_NUM = readr::col_character(),
vcf_pos = readr::col_character(),
vcf_qual = readr::col_number()))
1270622 parsing failures.
row col expected actual file
1 vcf_qual a number . '/home/rstudio/OpenPBTA/data/pbta-snv-consensus-mutation.maf.tsv.gz'
2 vcf_qual a number . '/home/rstudio/OpenPBTA/data/pbta-snv-consensus-mutation.maf.tsv.gz'
3 vcf_qual a number . '/home/rstudio/OpenPBTA/data/pbta-snv-consensus-mutation.maf.tsv.gz'
4 vcf_qual a number . '/home/rstudio/OpenPBTA/data/pbta-snv-consensus-mutation.maf.tsv.gz'
5 vcf_qual a number . '/home/rstudio/OpenPBTA/data/pbta-snv-consensus-mutation.maf.tsv.gz'
... ........ ........ ...... ....................................................................
See problems(...) for more details.
The main tasks are: - calculate a mean of read supports for REF and ALT from multiple callers which can be slightly different for the consenus maf file - create a maf format output to be appended on to the 3/3 consensus maf file
unique_cols <- c("t_depth",
"n_depth",
"t_ref_count",
"n_ref_count",
"t_alt_count",
"n_alt_count",
"caller",
"vcf_qual")
combined_hotspot_calls <- combined_hotspot_calls %>%
unique() %>%
group_by_at(setdiff(names(combined_hotspot_calls), unique_cols)) %>%
summarise(
t_depth= mean(as.numeric(t_depth)),
n_depth = mean(as.numeric(n_depth)),
t_ref_count = mean(as.numeric(t_ref_count)),
n_ref_count = mean(as.numeric(n_ref_count)),
t_alt_count = mean(as.numeric(t_alt_count)),
n_alt_count = mean(as.numeric(n_alt_count)),
caller_count = n(),
caller = toString(caller),
vcf_qual = mean(as.numeric(vcf_qual))) %>%
ungroup() %>%
mutate(VAF=t_alt_count/(t_ref_count+t_alt_count))
We will try to identify the read support for each combination of callers to visualize if any callers call hostspot sites with low read support calls which
ggplot(combined_hotspot_calls[,c("caller","caller_count","t_alt_count")],
aes(x=as.factor(caller_count),y=t_alt_count))+
geom_violin()+
ggpubr::stat_compare_means()
Hotspot sites called by just 1 caller seem to have majority low read support. Lets look at these calls that are unique to 1 caller.
unique_1caller_hotspots<- combined_hotspot_calls %>%
filter(caller_count==1)
unique_1caller_hotspots
What’s the distrbution of the t_alt_count
summary.caller <- unique_1caller_hotspots %>%
group_by(caller) %>%
tally()
ggplot(unique_1caller_hotspots[,c("caller","t_alt_count")],
aes(x=as.factor(caller),y=t_alt_count))+
geom_violin()+
geom_jitter(height = 0, width = 0.1) +
# add number of calls at the top
geom_text(data=summary.caller ,aes(x = caller, y = 350, label=n), fontface =2, size = 4)+
# draw a line where t_alt_count==5
geom_hline(aes(yintercept=5,color="red"))+
theme(legend.position="none")
NA
NA
Strelka has majority of the unique calls with a wide distribution of t_alt_count. Some calls from lancet,mutect2 are below the red line which denotes t_alt_count==5.
We will first filter combined_hotspot_calls to keep only rows that are not found in consensus_maf_file
check_with_consensus<- consensus_maf_file %>%
mutate("consensus"="Yes") %>%
select(-one_of(unique_cols),-VAF,-vcf_pos)
Unknown columns: `caller`
combined_hotspot_calls %>%
select(-caller_count,-caller,-Amino_Acid_Position,-unique_cols,-VAF) %>%
mutate(hotspot="Yes") %>%
left_join(check_with_consensus) %>%
filter(is.na(consensus))
Joining, by = c("Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome", "Start_Position", "End_Position", "Strand", "Variant_Classification", "Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2", "dbSNP_RS", "dbSNP_Val_Status", "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode", "Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2", "Tumor_Validation_Allele1", "Tumor_Validation_Allele2", "Match_Norm_Validation_Allele1", "Match_Norm_Validation_Allele2", "Verification_Status", "Validation_Status", "Mutation_Status", "Sequencing_Phase", "Sequence_Source", "Validation_Method", "Score", "BAM_File", "Sequencer", "Tumor_Sample_UUID", "Matched_Norm_Sample_UUID", "HGVSc", "HGVSp", "HGVSp_Short", "Transcript_ID", "Exon_Number", "all_effects", "Allele", "Gene", "Feature", "Feature_type", "Consequence", "cDNA_position", "CDS_position", "Protein_position", "Amino_acids", "Codons", "Existing_variation", "ALLELE_NUM", "DISTANCE", "STRAND_VEP", "SYMBOL", "SYMBOL_SOURCE", "HGNC_ID", "BIOTYPE", "CANONICAL", "CCDS", "ENSP", "SWISSPROT", "TREMBL", "UNIPARC", "RefSeq", "SIFT", "PolyPhen", "EXON", "INTRON", "DOMAINS", "AF", "AFR_AF", "AMR_AF", "ASN_AF", "EAS_AF", "EUR_AF", "SAS_AF", "AA_AF", "EA_AF", "CLIN_SIG", "SOMATIC", "PUBMED", "MOTIF_NAME", "MOTIF_POS", "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "IMPACT", "PICK", "VARIANT_CLASS", "TSL", "HGVS_OFFSET", "PHENO", "MINIMISED", "ExAC_AF", "ExAC_AF_AFR", "ExAC_AF_AMR", "ExAC_AF_EAS", "ExAC_AF_FIN", "ExAC_AF_NFE", "ExAC_AF_OTH", "ExAC_AF_SAS", "GENE_PHENO", "FILTER", "flanking_bps", "vcf_id", "ExAC_AF_Adj", "ExAC_AC_AN_Adj", "ExAC_AC_AN", "ExAC_AC_AN_AFR", "ExAC_AC_AN_AMR", "ExAC_AC_AN_EAS", "ExAC_AC_AN_FIN", "ExAC_AC_AN_NFE", "ExAC_AC_AN_OTH", "ExAC_AC_AN_SAS", "ExAC_FILTER", "gnomAD_AF", "gnomAD_AFR_AF", "gnomAD_AMR_AF", "gnomAD_ASJ_AF", "gnomAD_EAS_AF", "gnomAD_FIN_AF", "gnomAD_NFE_AF", "gnomAD_OTH_AF", "gnomAD_SAS_AF")